# -*- coding: utf-8 -*-

import urllib2
from bs4 import BeautifulSoup
import codecs
import re
from lib import config
import json
import time
import MySQLdb

def get_html(url, file_op, dbconnect_time = 0):
	"""Download ``url`` and return the response body.

	A failed download is logged to ``file_op`` and retried once after a
	five second pause.

	Args:
		url: address to fetch.
		file_op: writable file-like object that receives error messages.
		dbconnect_time: number of retries already performed; callers
			normally leave it at 0.

	Returns:
		The page content, or '' when the download failed on the first
		attempt and on the retry.
	"""
	try:
		page = urllib2.urlopen(url, timeout=10)
		try:
			return page.read()
		finally:
			# Release the connection even when read() fails.
			page.close()
	except Exception as e:
		# Best-effort fetch: any failure is logged and turned into a retry
		# or an empty result rather than aborting the whole crawl.
		file_op.write('%s : %s | %s\n\n' % (Exception, e, url))
		file_op.flush()
		if dbconnect_time >= 1:
			return ''
		# Pause five seconds, then try once more.
		retry_msg = 'stop 5 seconds, redbconnect!' + url + "\r\n"
		file_op.write(retry_msg + '\n')
		print(retry_msg)
		time.sleep(5)
		# The retry's result must be returned; it used to be discarded, so
		# a successful second attempt still produced ''.
		return get_html(url, file_op, dbconnect_time + 1)

def datetime_timestamp(dt):
	"""Convert a video's entry date to a Unix timestamp.

	Args:
		dt: date string in ``%Y-%m-%d`` format; it is interpreted by
			``time.mktime`` as local time.

	Returns:
		int: seconds since the epoch for that date, or the current time
		when ``dt`` cannot be parsed.
	"""
	time_format = '%Y-%m-%d'
	try:
		return int(time.mktime(time.strptime(dt, time_format)))
	except (ValueError, TypeError, OverflowError) as e:
		# A malformed or non-string argument falls back to "now" instead of
		# aborting the caller.
		print('%s %s' % (Exception, e))
		return int(time.time())

def pro_onlivedata_from_huya(zbid, f_url_ex, url_pre = 'http://www.huya.com/'):
	"""Look up the live-stream address of a huya room.

	The room page (e.g. http://www.huya.com/huli666) is downloaded and
	searched for the element with id ``flash-link``.

	Args:
		zbid: room identifier appended to ``url_pre``.
		f_url_ex: log file handed to ``get_html`` for download errors.
		url_pre: base URL of the site.

	Returns:
		The ``value`` attribute of the ``flash-link`` element, or '' when
		the page could not be downloaded or contains no such element or
		attribute.
	"""
	url = url_pre + zbid
	print(u'huya' + url)
	html = get_html(url, f_url_ex)
	# Short pause after every room page request.
	time.sleep(0.2)
	if not html.strip():
		# Previously fell through and returned None here.
		return ''
	soup = BeautifulSoup(html, 'html.parser', from_encoding = 'utf-8')
	# Locate the element carrying the current stream address.
	v_obj = soup.find(id='flash-link')
	if v_obj is None:
		return ''
	return v_obj.get('value', '')
def pro_onlivedata_from_douyu(zbid, f_url_ex, url_pre = 'http://staticlive.douyutv.com/common/share/play.swf?room_id='):
	"""Build the live-stream address of a douyu room.

	The address is the player URL followed by the room id, e.g.
	http://staticlive.douyutv.com/common/share/play.swf?room_id=man520

	Args:
		zbid: room id appended to ``url_pre``.
		f_url_ex: not used by this function.
		url_pre: player URL prefix.

	Returns:
		``url_pre`` concatenated with ``zbid``.
	"""
	player_url = url_pre + zbid
	return player_url

def getConnection():
	"""Open a MySQL connection from the settings in ``config``.

	Returns:
		The MySQLdb connection, with ``config.DB_NAME`` selected as the
		current database.
	"""
	connect_args = {
		'host': config.DB_HOST,
		'user': config.DB_USER,
		'passwd': config.DB_PASSWD,
		'charset': config.DB_CHARSET,
	}
	connection = MySQLdb.connect(**connect_args)
	connection.select_db(config.DB_NAME)
	return connection

# Parameterized INSERT used by every crawler that writes straight to MySQL.
_ONLIVE_INSERT_SQL = "INSERT INTO `sy_video_onlive` (`zbid`, `zbname`, `source`, `title`, `views`, `category`, `isOnlive`, `inputtime`, `zb_thumb`, `thumb`, `livedata`) VALUES (%s, %s, %s, %s, %s, %s, 1, %s, %s, %s, %s)"

# Text-only variant: the douyu crawler writes SQL statements into its log
# file instead of executing them.
_ONLIVE_INSERT_DUMP = "INSERT INTO `sy_video_onlive` (`zbid`, `zbname`, `source`, `title`, `views`, `category`, `isOnlive`, `inputtime`, `zb_thumb`, `thumb`, `livedata`) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', 1, '%s', '%s', '%s', '%s');"


def _log_exception(f_err, exc):
	# Echo a failure to stdout and append it to the source's error log.
	message = '%s %s' % (Exception, exc)
	print(message)
	f_err.write(message + '\n')


def _fetch_json(api_url):
	# Request one list page with an empty request body and an empty referer.
	# NOTE(review): `json_sy` is not imported in the visible import block;
	# presumably a project helper module -- confirm and import it.
	return json_sy.request_ajax_url(api_url, {}, '')


def _store_row(dbconn, cursor, f_err, row):
	# Insert one room and commit. A failed INSERT is logged and skipped so a
	# single bad row does not stop the crawl.
	try:
		cursor.execute(_ONLIVE_INSERT_SQL, row)
	except Exception as e2:
		_log_exception(f_err, e2)
	dbconn.commit()


def _print_progress(page, views, top_views, game):
	# Progress line shared by the panda, longzhu and huomao crawlers.
	print(str(page) + ', views:'+ str(views) + ',top_views:'+ str(top_views) + ',' + game +'\r\n')


def _parse_douyu_views(text):
	# Convert douyu's viewer label to an int; a count followed by u'万' is
	# multiplied by 10000.
	pos = text.find(u'万')
	if pos > 0:
		return int(float(text[0:pos]) * 10000)
	return int(text)


def _crawl_huya(dbconn, onlive_src, f, f_err, f_url_ex):
	# Crawl huya's paged live-list API and insert the rooms of tracked games.
	game_type = {'lol':'lol', 'cf':'cf', 'minecraft':'minecraft', 'dota2':'dota2', 'hearthstone':'hearthstone', 'wow':'wow', 'starcraft':'starcraft', '1123':'diablo'}
	# List API example: http://www.huya.com/index.php?m=Live&do=ajaxAllLiveByPage&page=4&pageNum=1
	huya_api_pre = 'http://www.huya.com/index.php?m=Live&do=ajaxAllLiveByPage'
	# The highest viewer count seen on a page decides whether the next page
	# is fetched.
	top_views = 100
	page = 1
	while page < 200 and top_views > 50:
		# Reset per page, as the other sources do. Without the reset (and
		# with the unconverted totalCount that used to be compared here)
		# the stop condition could never become false.
		top_views = 49
		cursor = dbconn.cursor()
		try:
			response = _fetch_json(huya_api_pre + '&page=' + str(page))
			if response['data']:
				for zb in response['data']['list']:
					gameHostName = zb['gameHostName']
					if gameHostName not in game_type:
						continue
					views = int(zb['totalCount'])
					top_views = max(top_views, views)
					print(str(page) + ', views:'+ str(zb['totalCount']) + ',' + gameHostName +'\r\n')
					zbid = zb['privateHost']
					livedata = pro_onlivedata_from_huya(zbid, f_url_ex)
					_store_row(dbconn, cursor, f_err, (zbid, zb['nick'], onlive_src, zb['introduction'], views, game_type[gameHostName], int(time.time()), zb['avatar180'], zb['screenshot'], livedata))
		finally:
			cursor.close()
		page = page + 1


def _crawl_douyu(dbconn, onlive_src, f, f_err, f_url_ex):
	# Scrape douyu's HTML directory pages and write one INSERT statement per
	# tracked room to the log file `f`; nothing is written to the database.
	game_type = {u'英雄联盟':'lol', u'穿越火线':'cf', u'我的世界':'minecraft', 'DOTA2':'dota2', u'炉石传说':'hearthstone', u'魔兽世界':'wow', u'星际争霸':'starcraft', u'暗黑破坏神3':'diablo'}
	# Directory page example: http://www.douyutv.com/directory/isgame?page=2
	douyu_list_pre = 'http://www.douyutv.com/directory/isgame?page='
	# The highest viewer count seen on a page decides whether the next page
	# is fetched.
	top_views = 100
	page = 1
	while page < 30 and top_views > 20:
		top_views = 19
		html = get_html(douyu_list_pre + str(page), f_url_ex)
		if html.strip() != '':
			soup = BeautifulSoup(html, 'html.parser', from_encoding = 'utf-8')
			ul_list = soup.find(id = 'item_data').ul
			# Each <li> holds one room, roughly:
			#   <a href="/mmbly" class="list" title="...">
			#     <span class="img"><img src="...thumb.jpg"></span>
			#     <div class="mes">
			#       <h1 class="title">...</h1>
			#       <p class="moreMes"><span class="view">4417</span>
			#         <span class="nnt">...</span>
			#         <span class="zbName"><em>...</em></span></p>
			#     </div>
			#   </a>
			for item in ul_list.findAll('li'):
				gameHostName = item.find(class_='zbName').em.string
				if gameHostName not in game_type:
					continue
				views = _parse_douyu_views(item.find(class_='view').string)
				top_views = max(top_views, views)
				# The href is "/<room id>"; drop the leading slash.
				zbid = item.a['href'][1:]
				livedata = pro_onlivedata_from_douyu(zbid, f_url_ex)
				zbname = item.find(class_='nnt').string
				# NOTE: titles may contain characters such as u'\u200e'
				# that the gbk codec cannot encode.
				title = item.find(class_='title').string
				category = game_type[gameHostName]
				inputtime = int(time.time())
				zb_thumb = ''
				thumb = item.find(class_='img').img['src']
				print(str(page) + ', views:'+ str(views) + ',' + gameHostName + ', id:' + zbid +'\r\n')
				try:
					# NOTE(review): values are interpolated unescaped; do
					# not execute this dump against a database as-is.
					f.write(_ONLIVE_INSERT_DUMP % (zbid, zbname, onlive_src, title, views, category, inputtime, zb_thumb, thumb, livedata) + '\n')
				except Exception as e2:
					_log_exception(f_err, e2)
		page = page + 1


def _crawl_panda(dbconn, onlive_src, f, f_err, f_url_ex):
	# Crawl panda.tv's paged live-list API and insert the rooms of tracked games.
	game_type = {'lol':'lol', 'dota2':'dota2', 'hearthstone':'hearthstone', 'starcraft':'starcraft', 'zhuji':'zhuji'}
	# List API example: http://www.panda.tv/live_lists?status=2&order=person_num&pageno=2
	panda_api_pre = 'http://www.panda.tv/live_lists?status=2&order=person_num&pagenum=30&pageno='
	page = 1
	# The highest viewer count seen on a page decides whether the next page
	# is fetched.
	top_views = 100
	while page < 200 and top_views > 50:
		top_views = 49
		cursor = dbconn.cursor()
		try:
			response = _fetch_json(panda_api_pre + str(page))
			if response['data']:
				for zb in response['data']['items']:
					gameHostName = zb['classification']['ename']
					if gameHostName not in game_type:
						continue
					views = int(zb['person_num'])
					top_views = max(top_views, views)
					_print_progress(page, views, top_views, gameHostName)
					zbid = zb['id']
					# NOTE(review): pro_onlivedata_from_panda is not
					# defined in the visible file -- confirm where it lives.
					livedata = pro_onlivedata_from_panda(zbid, zb['room_key'], f_url_ex)
					_store_row(dbconn, cursor, f_err, (zbid, zb['userinfo']['nickName'], onlive_src, zb['name'], views, game_type[gameHostName], int(time.time()), zb['userinfo']['avatar'], zb['pictures']['img'], livedata))
		finally:
			cursor.close()
		page = page + 1


def _crawl_longzhu(dbconn, onlive_src, f, f_err, f_url_ex):
	# Crawl longzhu's stream API (offset based paging) and insert the rooms
	# of tracked games.
	game_type = {'lol':'lol', 'cf':'cf', 'smite':'smite', 'sc':'starcraft', 'pcconsole':'zhuji', 'minecraft':'minecraft', }
	# List API example: http://api.plu.cn/tga/streams/?game=0&max-results=30&start-index=60&sort-by=top&filter=0
	page_size = 30
	longzhu_api_pre = 'http://api.plu.cn/tga/streams/?game=0&sort-by=top&filter=0&max-results=' + str(page_size) + "&start-index="
	page = 0
	# The highest viewer count seen on a page decides whether the next page
	# is fetched.
	top_views = 100
	while page < 2 and top_views > 50:
		top_views = 49
		start_index = page * page_size
		cursor = dbconn.cursor()
		try:
			response = _fetch_json(longzhu_api_pre + str(start_index))
			if response['data']:
				for zb in response['data']['items']:
					gameHostName = zb['game'][0]['tag']
					if gameHostName not in game_type:
						continue
					views = int(zb['viewers'])
					top_views = max(top_views, views)
					_print_progress(page, views, top_views, gameHostName)
					channel = zb['channel']
					# NOTE(review): pro_onlivedata_from_longzhu is not
					# defined in the visible file -- confirm where it lives.
					livedata = pro_onlivedata_from_longzhu(channel['vid'], channel['id'], f_url_ex)
					_store_row(dbconn, cursor, f_err, (channel['domain'], channel['name'], onlive_src, channel['status'], views, game_type[gameHostName], int(time.time()), channel['avatar'], zb['preview'], livedata))
		finally:
			cursor.close()
		page = page + 1


def _crawl_huomao(dbconn, onlive_src, f, f_err, f_url_ex):
	# Crawl huomaotv's channel API and insert the live rooms of tracked games.
	game_type = {'17':'lol', '23':'dota2', '13':'hearthstone', '11':'starcraft', '53':'zhuji'}
	# List API example: http://www.huomaotv.com/channel/all?ajax=1&p=1
	huomao_api_pre = 'http://www.huomaotv.com/channel/all?ajax=1&p='
	page = 1
	# The highest viewer count seen on a page decides whether the next page
	# is fetched.
	top_views = 100
	while page < 2 and top_views > 30:
		top_views = 29
		cursor = dbconn.cursor()
		try:
			# Unlike the other APIs, the response itself is iterated as the
			# room list.
			response = _fetch_json(huomao_api_pre + str(page))
			if response:
				for zb in response:
					# Skip rooms that are not live.
					if zb['is_live'] == '0':
						continue
					gameHostName = zb['gid']
					if gameHostName not in game_type:
						continue
					views = int(zb['views'])
					top_views = max(top_views, views)
					_print_progress(page, views, top_views, gameHostName)
					zbid = zb['id']
					# NOTE(review): pro_onlivedata_from_huomao is not
					# defined in the visible file -- confirm where it lives.
					livedata = pro_onlivedata_from_huomao(zbid, f_url_ex)
					# Request the larger avatar size.
					zb_thumb = zb['head_img'].replace('small','middle')
					thumb = 'http://www.huomaotv.com' + zb['img']
					_store_row(dbconn, cursor, f_err, (zbid, zb['username'], onlive_src, zb['channel'], views, game_type[gameHostName], int(time.time()), zb_thumb, thumb, livedata))
		finally:
			cursor.close()
		page = page + 1


# Supported sources and the crawler that handles each of them.
_CRAWLERS = {
	'huya': _crawl_huya,
	'douyu': _crawl_douyu,
	'panda': _crawl_panda,
	'longzhu': _crawl_longzhu,
	'huomao': _crawl_huomao,
}


def main(url, onlive_src = 'huomao'):
	"""Crawl one live-streaming site and record its rooms.

	Three log files are opened next to the script for the chosen source:
	``<src>.log`` (truncated), ``<src>_err.log`` and
	``<src>_pagedown_err.log`` (both appended). The log files and the
	database connection are closed even when the crawl raises.

	Args:
		url: not used; kept so existing callers keep working.
		onlive_src: site to crawl, one of 'huya', 'douyu', 'panda',
			'longzhu' or 'huomao'.

	Raises:
		ValueError: ``onlive_src`` is not a supported source.
	"""
	crawler = _CRAWLERS.get(onlive_src)
	if crawler is None:
		raise ValueError('unsupported onlive_src: %r' % (onlive_src,))
	dbconn = getConnection()
	logs = []
	try:
		for suffix, mode in (('.log', 'w'), ('_err.log', 'a+'), ('_pagedown_err.log', 'a+')):
			logs.append(codecs.open(onlive_src + suffix, mode, 'utf8'))
		f, f_err, f_url_ex = logs
		crawler(dbconn, onlive_src, f, f_err, f_url_ex)
	finally:
		for handle in logs:
			handle.close()
		dbconn.close()

if __name__ == "__main__":
	# Script entry point: run the crawler with the default page path.
	main('/media/video/renqi/top/lol/sec/mv/cate/all/page/1.html')
